# ==============================================================================
# file name: 01-identify-survey-hosts-LU.sql
# date:	Nov 24, 2022
# author: Bernhard Clemm 
# purpose: Identify hosts of survey sites (Lucid) with three approaches:
#          (1) questionnaire tools from Bevec & Vehovar (2021)
#          (2) all URL hosts containing "survey"
#          (3) manually coding most-visited 500 URL hosts not already in (1)/(2)
# THIS SCRIPT REQUIRES ACCESS TO THE RAW DATA AND SERVES FOR REFERENCE ONLY
# ==============================================================================

# (1) Bevec & Vehovar (2021) =====================================================

-- Match URL hosts in data to patterns and create list of unique hosts.
-- See 00-create-regex-bevec-vehovar.R for how the match pattern is created; the domain
-- list below comes from "bevec_url_matches_patterns.txt".
--
-- Output: one row per matching US url_host with its number of visits (ct).
-- This table is used in the query in 02-summarize-survey-visits-LU.sql.
--
-- Notes on the pattern:
--   * The pasted pattern listed every domain twice ('^d$' and '\.d$'). Both forms are
--     folded into '(^|\.)(d1|d2|...)$': the host either IS the domain or ends in
--     '.<domain>' (any subdomain).
--   * The pasted literal was hard-wrapped, which put line breaks inside the pattern and
--     silently disabled four alternatives (bare opiniator.com, bare explorance.com,
--     subdomains of survey.websitegear.com and of startquestion.com). The pattern is now
--     assembled from short literals with ||, so no line break can end up inside it.
--   * Dots inside domain names are escaped so that they only match a literal '.'.
--   * The '\.' escape relies on the engine passing backslashes in string literals through
--     to the regex unchanged, exactly as the original pattern did.
--   * NOTE(review): 'docs.google.com/forms' contains a path; if url_host holds host names
--     only, this alternative can never match -- confirm against the url_host definition.
create table hosts_bevec_LUCID as
select
    url_host,
    count(*) as ct
from visits_clean_url
where country = 'US'
  and regexp_like(
        url_host,
        '(^|\.)('
        || 'form\.com|submit\.com|submit\.link|formsite\.com|formdesk\.com|typeform\.com|'
        || 'perfectforms\.com|jotform\.com|formassembly\.com|tfaforms\.net|emailmeform\.com|wufoo\.com|'
        || 'docs\.google\.com/forms|mightyforms\.com|nestforms\.com|123formbuilder\.com|fastfieldforms\.com|'
        || 'forms\.office\.com|paperform\.co|aidaform\.com|forms\.app|novaresearch\.com|blaise\.com|'
        || 'infowiz-umfragen\.ch|riddle\.com|nebu\.com|surveypro\.com|dynata\.com|cmix\.com|nipo\.com|'
        || 'nfieldmr\.com|tns-nipo\.com|surveysystem\.com|snapsurveys\.com|getfeedback\.com|'
        || 'objectplanet\.com|popsurvey\.com|checkbox\.com|datstat\.com|surveyconnect\.com|confirmit\.com|'
        || 'infosurv\.com|instantsurvey\.com|freepolls\.com|surveymonkey\.com|surveyplanet\.com|'
        || 'createsurvey\.com|surveyfactory\.com|intellisurvey\.com|voxco\.com|questback\.com|enalyzer\.com|'
        || 'questionpro\.com|dash\.ca|askia\.com|avius\.com|webropol\.com|webropolsurveys\.com|'
        || 'learningbridge\.com|surveyshare\.com|keysurvey\.com|aussurveys\.com|plotto\.com|quantilope\.com|'
        || 'survey-maker\.com|supersurvey\.com|checkmarket\.com|zipsurvey\.com|surveygalaxy\.com|'
        || 'surveylab\.com|mysurveylab\.com|feedbackloop\.com|selectsurvey\.net|zarca\.com|'
        || 'amplituderesearch\.com|amplitude\.com|surveywriter\.net|simplesurvey\.com|surveymonster\.com|'
        || 'psychdata\.com|survio\.com|surveyanalytics\.com|2ask\.net|survey\.websitegear\.com|'
        || 'surveymanager\.com\.au|jambo-software\.com|peoplepulse\.com|digivey\.com|vista-survey\.com|'
        || 'papayapolls\.com|encuestafacil\.com|grapevinesurveys\.com|sogosurvey\.com|surveybox\.co\.uk|'
        || 'warpit\.net|ezysurvey\.com|star360feedback\.com|unipark\.com|unipark\.de|directpoll\.com|'
        || 'netigate\.net|netigate\.se|surveyhero\.com|crowdsignal\.com|poll\.fm|surveymethods\.com|'
        || 'visionmetrics\.net|tailorbuilder\.com|micropoll\.com|thesistools\.com|thesistoolspro\.com|'
        || 'survs\.com|limesurvey\.org|encuesta\.com|esurveyspro\.com|questionstar\.com|opiniator\.com|'
        || 'riddlemethis\.net|kwiksurveys\.com|freeonlinesurveys\.com|surveypie\.com|novisurvey\.net|'
        || 'beyondfeedback\.com|magicsurveytool\.com|obsurvey\.com|interceptum\.com|polleverywhere\.com|'
        || 'pollev\.com|1ka\.si|websurveycreator\.com|mysurvs\.com|surveyact\.com|evalandgo\.com|'
        || 'smartsurvey\.co\.uk|qualifio\.com|qualifioapp\.com|pinnion\.com|easypolls\.net|surveyonics\.com|'
        || 'surveylegend\.com|sparkchart\.com|idsurvey\.com|onepointglobal\.com|onepointsurveys\.com|'
        || 'smpsurveys\.com|soorvey\.com|trainingcheck\.com|nicereply\.com|customerthermometer\.com|'
        || 'onlinesurveys\.ac\.uk|quicktapsurvey\.com|formstack\.com|ealicia\.com|surveygoo\.com|'
        || 'infogram\.com|sutisurvey\.com|suggestar\.com|surveycto\.com|surveyrock\.com|tabsurvey\.com|'
        || 'supersimplesurvey\.com|surveyface\.com|surveycrest\.com|pollfish\.com|jibunu\.com|newlio\.com|'
        || 'qualaroo\.com|surveyproject\.org|surveyproject\.net|surveyanyplace\.com|pointerpro\.com|'
        || 'izisurvey\.com|loopsurvey\.com|surveykiwi\.com|survicate\.com|delighted\.com|surveynuts\.com|'
        || 'zonkafeedback\.com|zonka\.co|qpointsurvey\.com|dragnsurvey\.com|harvestyourdata\.com|'
        || 'qeryz\.com|refiner\.io|pinpoll\.com|centiment\.co|getfoureyes\.com|inquisium\.com|'
        || 'informizely\.com|survtapp\.com|surveysparrow\.com|startquestion\.com|primosurvey\.com|'
        || 'rogator\.de|yesinsights\.com|customer\.guru|thinksurvey\.co|ngsurvey\.com|talk2us\.io|'
        || 'askarbit\.com|surveyfoxy\.com|surveyocean\.com|surveyjs\.io|youengage\.me|tolunastart\.com|'
        || 'toluna\.com|tolunainsights\.com|surveyvista\.com|phonic\.ai|blocksurvey\.io|surveysensum\.com|'
        || 'alida\.com|sawtooth\.com|reputation\.com|syberworks\.com|sawtoothsoftware\.com|servicenow\.com|'
        || 'suzy\.com|astutesolutions\.com|focusvision\.com|forsta\.com|kahootz\.com|nfocus\.com|'
        || 'pisano\.com|zondera\.com|activecampaign\.com|qualtrics\.com|artologik\.com|verint\.com|'
        || 'iscripts\.com|remarksoftware\.com|nextiva\.com|centercode\.com|turnfriendly\.com|explorance\.com|'
        || 'tendenci\.com|inmoment\.com|survey\.zoho\.com|survey\.zohopublic\.eu|survey\.zohopublic\.com|'
        || 'forms\.zohopublic\.com|forms\.zohopublic\.eu|forms\.zoho\.com|involve\.me|proprofs\.com|'
        || 'examinare\.com|quantumworkplace\.com|4screens\.net|greenrope\.com|conjointly\.com|conjoint\.ly|'
        || 'pabbly\.com|catglobe\.com|spotler\.co\.uk|spotler\.com|plumvoice\.com|alchemer\.com|'
        || 'surveygizmo\.com|mysurveygizmo\.com|marugroup\.net|marusurveys\.com|maruhub\.com|qone-tech\.com|'
        || 'outgrow\.co|outgrow\.us|upwave\.io|omniconvert\.com'
        || ')$'
      )
group by url_host
order by ct desc;

# (2) URL hosts containing "survey" ============================================

-- All US hosts whose name contains the substring "survey" and that were not already
-- captured by approach (1). Output: one row per url_host with its number of visits (ct).
-- This table is used in the query in 02-summarize-survey-visits-LU.sql.
create table hosts_survey_LUCID as
select
    url_host,
    count(*) as ct  -- count(*) instead of the zero-argument count(): same result, portable
from visits_clean_url
where country = 'US'
  -- unanchored pattern: matches "survey" anywhere in the host
  and regexp_like(url_host, 'survey')
  -- NOT IN is kept on purpose: hosts_bevec_LUCID must already exist and is expected
  -- to hold no NULL url_host (a NULL there would make this filter reject every row)
  and url_host not in (select url_host from hosts_bevec_LUCID)
group by url_host
order by ct desc;

# (3) most-visited 500 =========================================================

-- The 500 most-visited US hosts that approaches (1) and (2) did not already capture.
--
-- ORDER BY and LIMIT are kept at the same query level. With the ordering placed inside
-- a derived table and the LIMIT outside it, the engine is free to discard the inner
-- ordering, in which case the LIMIT keeps an arbitrary 500 hosts rather than the
-- most-visited ones. url_host is added as a tie-breaker so that the cut-off at rank 500
-- is reproducible when several hosts share the same count.
--
-- Both lookup tables must exist before this runs. NOT IN is kept on purpose: rows whose
-- url_host is NULL are filtered out by it as long as the lookup tables are non-empty.
select
    url_host,
    count(*) as ct
from visits_clean_url
where country = 'US'
  and url_host not in (select url_host from hosts_bevec_LUCID)
  and url_host not in (select url_host from hosts_survey_LUCID)
group by url_host
order by ct desc, url_host
limit 500;
-- this set is exported as hosts_500_FB_coded.csv and manually coded
-- NOTE(review): the file name says "FB" although this script covers Lucid (LU) -- confirm



